1 Setup

# Load packages: data wrangling (tidyverse), paths (here), assertions,
# modelling (tidymodels + xgboost engine), timing (tictoc), tables (gt),
# variable importance (vip), and logging (futile.logger).
library("tidyverse")
library("here")
library("assertthat")
library("tidymodels")
library("xgboost")
library("tictoc")
library("gt")
library("vip")
library("futile.logger")
# Resolve name clashes in favor of tidymodels, with two explicit
# exceptions: yardstick::spec() and purrr::discard().
tidymodels_prefer() 
conflicted::conflict_prefer("spec", "yardstick")
conflicted::conflict_prefer("discard", "purrr")

# Project constants: file paths, forbidden_vars, rerun_all, write_to_disk, etc.
source(here("src", "01-constants.R"))

# Detect CPU cores (used by the ranger engine below) and register a
# parallel backend for resampling/tuning.
cores <- parallel::detectCores()
doParallel::registerDoParallel()

2 Init logger

## NULL
## NULL

3 Train and Test

Dataset used: real

Dataset path: /Users/sebastiansaueruser/Google Drive/research/Covid19/covid-icu-old/data/processed/data-prepared.csv

3.1 Import data

# Guard: the prepared data file must exist before anything else runs.
stopifnot(file.exists(data_processed_path))

# Read the prepared analysis data set and log what was read.
d <- read_csv(data_processed_path)
flog.info("Processed data was read.")
flog.info(sprintf("Dimensions of data set read are: %s", str_c(dim(d), collapse = ", ")))
flog.info(sprintf("Data path: %s", data_processed_path))

Dimensions of the data set:

dim(d)
## [1] 665  65

3.2 DV as factor

Define the dependent variable (DV) as a factor, since we are modelling a classification task:

# Encode the outcome `verlauf` as a factor (classification setting).
d <- mutate(d, verlauf = factor(verlauf))

3.3 Check

3.3.1 NA per column/variable

List of columns with missing values

# Count missing values per column, keep only columns with at least one NA,
# and render the result as a table sorted by NA count (descending).
d %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(cols = everything()) %>%
  arrange(desc(value)) %>%
  filter(value > 0) %>%
  gt()
name value
ro 55
neutros_nl 38
lymphos_nl 38
nlr 38
ven_laktat_mmol_l 37
ven_bga_p_co2_mm_hg 35
vbga_p_o2_mm_hg 35
procalcitonin_ng_m_l 32
bg 25
bilirubin_ges_mg_d_l 21
alt_u_l 16
ast_u_l 15
ldh_u_l 13
q_sofa 10
harnstoff_mg_dl 9
af 8
quick_percent 8
esi 3
e_gfr_ml_min 3
hf 2
bp_dia 2
temp 2
thrombos_nl 2
bp_sys 1
gcs 1
supportive_o2 1
leukos_nl 1
crea_mg_dl 1
akutes_nierenversagen 1

Number of observations when all NAs (in all columns) are removed:

# Number of rows with no missing value in any column.
sum(complete.cases(d))
## [1] 511

3.4 Splitting

set.seed(42)  # reproducible train/test split
# 80/20 split, stratified by the outcome to keep class proportions.
d_split <- initial_split(d, prop = .8, strata = verlauf)
d_split
d_train <- training(d_split)
# BUG FIX: the test set was previously assigned with training(), making
# d_test a copy of d_train. Use testing() to get the held-out assessment set.
d_test <- testing(d_split)

4 Define workflow

4.1 Models

4.1.1 Majority model

# Class frequencies and proportions in the training set — the majority
# class defines the accuracy baseline.
d_train %>%
  count(verlauf, name = "n") %>%
  mutate(verlauf_prop = n / sum(n)) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric))
verlauf n verlauf_prop
bad 109.00 0.21
good 422.00 0.79

Hence, the accuracy of this baseline (majority-class) model equals the larger of the two class proportions, i.e. about 0.79; any useful model must exceed this value.

4.1.2 XGB

## XGBoost specification: 1000 trees; all key hyperparameters tuned via CV.
## (Fixed: removed the stray trailing comma after learn_rate = tune().)
xgb_mod <-
  boost_tree(
    trees = 1000,
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),             ## first three: model complexity
    sample_size = tune(), mtry = tune(), ## randomness
    learn_rate = tune()                  ## step size
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

4.1.3 RF

## Random-forest specification: 1000 trees; mtry and min_n tuned via CV.
rf_mod <- 
  rand_forest(mtry = tune(), 
              min_n = tune(), 
              trees = 1000) %>% 
  set_engine("ranger", 
             num.threads = cores, # parallelize within ranger over detected cores
             importance = "permutation") %>% # permutation importance enables vip() later
  set_mode("classification")

4.1.4 Logistic regression

# Plain logistic regression; parsnip's defaults apply
# (engine "glm", mode "classification").
logistic_mod <- logistic_reg()

4.2 CV

10 folds, 3 repeats.

# Repeated cross-validation: vfold_cv's default of 10 folds, 3 repeats.
folds_spec <- vfold_cv(d_train, repeats = 3)

4.3 Recipe

## Shared preprocessing recipe for all three models.
basic_recipe <-
  recipe(verlauf ~ ., data = d_train) %>% 
  update_role(id, new_role = "ID") %>%   # keep id in the data but not as predictor
  update_role(all_of(forbidden_vars), new_role = "ID") %>%   # excluded vars from 01-constants.R
  step_zv(all_numeric(), -all_outcomes()) %>%   # drop zero-variance numerics
  step_normalize(all_numeric(), -all_outcomes()) %>%   # center and scale numerics
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") %>%   # drop highly correlated predictors
  step_impute_knn(all_predictors())
# NOTE(review): KNN imputation runs LAST, so step_normalize/step_corr operate
# on data that still contains NAs — confirm this ordering is intentional.

4.4 Workflows

# Bundle each model specification with the shared preprocessing recipe.
xgb_wf <- workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(xgb_mod)

rf_wf <- workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(rf_mod)

rf_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## • step_corr()
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
## 
## Engine-Specific Arguments:
##   num.threads = cores
##   importance = permutation
## 
## Computational engine: ranger
# Workflow for the logistic-regression baseline, same recipe as the others.
logistic_wf <- workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(logistic_mod)

4.5 Define performance metrics

## Performance metric set used for every model candidate: hard-class metrics
## (recall, precision, f_meas, accuracy, kap, sens, spec, ppv) plus the
## probability-based roc_auc. Defined once so tuning calls can reuse it.
class_metrics <- 
  metric_set(
      recall, 
      precision, 
      f_meas, 
      accuracy, 
      kap,
      roc_auc, 
      sens, 
      spec,
      ppv
  )

4.6 Fit resamples/tune

4.6.1 XGB

## Space-filling (Latin hypercube) grid of 30 XGB hyperparameter candidates.
xgb_grid <-
  grid_latin_hypercube(
    tree_depth(),
    min_n(),
    loss_reduction(),
    sample_size = sample_prop(),   # sample_size tuned as a proportion
    finalize(mtry(), d_train),     # mtry's upper bound depends on predictor count
    learn_rate(),
    size = 30
  )
# NOTE: the duplicate doParallel::registerDoParallel() call was removed here;
# the parallel backend is already registered in the setup section.

## Tune XGB over the grid, or load cached results if present.
## Fixes: the workflow was erroneously passed to tune_grid() a second time
## (it is already the piped object); scalar `&&` instead of vector `&`;
## removed a no-op paste0() wrapper; reuse class_metrics instead of an
## inline duplicate metric_set; logging made consistent with the RF branch.
if (rerun_all == FALSE && file.exists(here::here(xgb01_outputfile))) {
  flog.info("Reading RDS file for XGB 01 model.")
  xgb_res <- read_rds(file = xgb01_outputfile)
} else {
  flog.info("Computing XGB 01 model.")
  tic()
  cat("Computing XGB with tuning grid\n")
  xgb_res <-
    xgb_wf %>% 
    tune_grid(
      resamples = folds_spec,
      grid = xgb_grid,
      metrics = class_metrics,
      control = control_grid(save_pred = TRUE)
    )
  toc()
  write_rds(xgb_res, file = xgb01_outputfile)
} 

4.6.2 RF

rf_mod
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
## 
## Engine-Specific Arguments:
##   num.threads = cores
##   importance = permutation
## 
## Computational engine: ranger
## Tune the random forest over an automatic grid of 25 candidates, or load
## cached results if present. Fixes: scalar `&&` instead of vector `&`;
## removed a no-op paste0() wrapper; reuse class_metrics instead of an
## inline duplicate metric_set.
if (rerun_all == FALSE && file.exists(here::here(rf01_outputfile))) {
  flog.info("Reading RDS file for Random Forest 01 model.")
  rf_res <- read_rds(file = rf01_outputfile)
} else {
  flog.info("Computing Random Forest 01 model.")
  tic()
  cat("Computing RF with tuning grid\n")
  rf_res <-
    rf_wf %>% 
    tune_grid(
      resamples = folds_spec,
      grid = 25,
      control = control_grid(save_pred = TRUE),
      metrics = class_metrics
    )
  toc()
  write_rds(rf_res, file = rf01_outputfile)
}

4.6.3 Logistic regression

## Fit the logistic regression across the resamples (no tuning parameters).
## Fixes: control_resamples() is the documented control for fit_resamples()
## (control_grid() happened to work only via inheritance); reuse
## class_metrics instead of an inline duplicate metric_set.
logistic_res <-
  logistic_wf %>% 
  fit_resamples(
    resamples = folds_spec,
    control = control_resamples(save_pred = TRUE),
    metrics = class_metrics
  )

write_rds(logistic_res, file = logistic01_outputfile)

5 Results

First, let’s check the performance in the resamples and see which model candidates performed best.

5.1 XGB

Let’s have a look at the results:

# First 10 rows of the CV-averaged XGB tuning metrics, as a formatted table.
xgb_res %>%
  collect_metrics(summarize = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric))
mtry min_n tree_depth learn_rate loss_reduction sample_size .metric .estimator mean n std_err .config
5.00 18.00 15.00 0.00 0.00 0.42 accuracy binary 0.79 30.00 0.01 Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 f_meas binary NaN 0.00 NA Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 kap binary 0.00 30.00 0.00 Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 ppv binary NaN 0.00 NA Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 precision binary NaN 0.00 NA Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 recall binary 0.00 30.00 0.00 Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 roc_auc binary 0.72 30.00 0.02 Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 sens binary 0.00 30.00 0.00 Preprocessor1_Model01
5.00 18.00 15.00 0.00 0.00 0.42 spec binary 1.00 30.00 0.00 Preprocessor1_Model01
39.00 6.00 5.00 0.00 0.00 0.71 accuracy binary 0.83 30.00 0.01 Preprocessor1_Model02
# Visualize tuning results across hyperparameter values.
autoplot(xgb_res)

if (write_to_disk) ggsave(filename = paste0(figs_prefix_path, "xgb_res_cv.pdf"), 
       width = 10, height = 10)

# Explicit metric: avoids relying on show_best()'s default (first metric in
# the set) and the deprecation warning in newer tune versions.
show_best(xgb_res, metric = "recall")
## # A tibble: 5 × 12
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1    20    35          5   1.33e-10  0.515               0.347 recall 
## 2    36    37          8   7.10e- 7  0.00000000765       0.137 recall 
## 3    61    17          4   3.15e- 2  0.0196              0.757 recall 
## 4    51     9         11   1.06e- 2  0.000000164         0.904 recall 
## 5    13     5         14   1.44e- 2  0.0000000270        0.436 recall 
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## #   std_err <dbl>, .config <chr>

5.2 RF

# First 10 rows of the CV-averaged RF tuning metrics, as a formatted table.
rf_res %>%
  collect_metrics(summarize = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric))
mtry min_n .metric .estimator mean n std_err .config
25.00 29.00 accuracy binary 0.66 30.00 0.06 Preprocessor1_Model01
25.00 29.00 f_meas binary NaN 0.00 NA Preprocessor1_Model01
25.00 29.00 kap binary 0.00 19.00 0.00 Preprocessor1_Model01
25.00 29.00 ppv binary NaN 0.00 NA Preprocessor1_Model01
25.00 29.00 precision binary NaN 0.00 NA Preprocessor1_Model01
25.00 29.00 recall binary 0.00 19.00 0.00 Preprocessor1_Model01
25.00 29.00 roc_auc binary 0.50 17.00 0.00 Preprocessor1_Model01
25.00 29.00 sens binary 0.00 19.00 0.00 Preprocessor1_Model01
25.00 29.00 spec binary 1.00 28.00 0.00 Preprocessor1_Model01
28.00 36.00 accuracy binary 0.66 30.00 0.06 Preprocessor1_Model02
# Visualize tuning results across hyperparameter values.
autoplot(rf_res)

if (write_to_disk) ggsave(filename = paste0(figs_prefix_path, "rf_res_cv.pdf"), 
       width = 10, height = 10)

# Explicit metric: avoids relying on show_best()'s default (first metric in
# the set) and the deprecation warning in newer tune versions.
show_best(rf_res, metric = "recall")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    12     2 recall  binary     0.158    19  0.0770 Preprocessor1_Model20
## 2    32    11 recall  binary     0.132    19  0.0749 Preprocessor1_Model10
## 3    13     5 recall  binary     0.132    19  0.0749 Preprocessor1_Model23
## 4    10     9 recall  binary     0.105    19  0.0723 Preprocessor1_Model17
## 5     6     7 recall  binary     0.105    19  0.0723 Preprocessor1_Model22

5.3 Logistic Regression

# CV-averaged metrics of the logistic-regression baseline.
logistic_res %>%
  collect_metrics(summarize = TRUE) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric))
.metric .estimator mean n std_err .config
accuracy binary 0.79 30.00 0.01 Preprocessor1_Model1
f_meas binary 0.38 30.00 0.02 Preprocessor1_Model1
kap binary 0.27 30.00 0.02 Preprocessor1_Model1
ppv binary 0.50 30.00 0.03 Preprocessor1_Model1
precision binary 0.50 30.00 0.03 Preprocessor1_Model1
recall binary 0.33 30.00 0.02 Preprocessor1_Model1
roc_auc binary 0.73 30.00 0.02 Preprocessor1_Model1
sens binary 0.33 30.00 0.02 Preprocessor1_Model1
spec binary 0.92 30.00 0.01 Preprocessor1_Model1
# Explicit metric avoids show_best()'s deprecated metric-guessing default.
show_best(logistic_res, metric = "recall")
## # A tibble: 1 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 recall  binary     0.327    30  0.0210 Preprocessor1_Model1

6 Best tuning parameters

6.1 RF

show_best(rf_res, metric = "recall")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    12     2 recall  binary     0.158    19  0.0770 Preprocessor1_Model20
## 2    32    11 recall  binary     0.132    19  0.0749 Preprocessor1_Model10
## 3    13     5 recall  binary     0.132    19  0.0749 Preprocessor1_Model23
## 4    10     9 recall  binary     0.105    19  0.0723 Preprocessor1_Model17
## 5     6     7 recall  binary     0.105    19  0.0723 Preprocessor1_Model22
select_best(rf_res, metric = "recall")
## # A tibble: 1 × 3
##    mtry min_n .config              
##   <int> <int> <chr>                
## 1    12     2 Preprocessor1_Model20
## Best RF hyperparameters by CV recall. A single select_best() call replaces
## the original two redundant calls; keep only the tuned columns.
rf_best_params <-
  select_best(rf_res, metric = "recall") %>%
  select(mtry, min_n)

6.1.1 Finalize workflow

# Plug the winning hyperparameters into the RF workflow and inspect it.
fin_rf_wf <- finalize_workflow(rf_wf, rf_best_params)
fin_rf_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## • step_corr()
## • step_impute_knn()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = 12
##   trees = 1000
##   min_n = 2
## 
## Engine-Specific Arguments:
##   num.threads = cores
##   importance = permutation
## 
## Computational engine: ranger

6.1.2 Final fit

## Fit the finalized workflow on the full training portion of the split and
## evaluate once on the held-out test set. Reuses class_metrics (defined in
## section 4.5) instead of yet another inline copy of the metric set.
final_rf_fit <-
  fin_rf_wf %>% 
  last_fit(d_split,
           metrics = class_metrics)

final_rf_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [531/134]> train/test split <tibble> <tibble> <tibble>     <workflow>

Check the notes:

final_rf_fit %>% pluck(".notes", 1) %>% pull(3)  # 3rd column of the notes tibble (presumably the note text) — character(0) means no warnings/errors
## character(0)
# Persist the final fit only when disk writes are enabled.
if (write_to_disk) {
  write_rds(final_rf_fit, file = final_rf_fit_file)
}

6.1.3 Get performance metrics

# Test-set performance of the final random-forest model.
final_fit_metrics_rf <- collect_metrics(final_rf_fit)

final_fit_metrics_rf
## # A tibble: 9 × 4
##   .metric   .estimator .estimate .config             
##   <chr>     <chr>          <dbl> <chr>               
## 1 recall    binary         0.25  Preprocessor1_Model1
## 2 precision binary         0.7   Preprocessor1_Model1
## 3 f_meas    binary         0.368 Preprocessor1_Model1
## 4 accuracy  binary         0.821 Preprocessor1_Model1
## 5 kap       binary         0.290 Preprocessor1_Model1
## 6 sens      binary         0.25  Preprocessor1_Model1
## 7 spec      binary         0.972 Preprocessor1_Model1
## 8 ppv       binary         0.7   Preprocessor1_Model1
## 9 roc_auc   binary         0.816 Preprocessor1_Model1
# Same metrics as a formatted table, dropping the .config bookkeeping column.
final_fit_metrics_rf %>%
  select(!.config) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric))
.metric .estimator .estimate
recall binary 0.25
precision binary 0.70
f_meas binary 0.37
accuracy binary 0.82
kap binary 0.29
sens binary 0.25
spec binary 0.97
ppv binary 0.70
roc_auc binary 0.82
## Variable-importance plot for the final model.
final_rf_fit %>% 
  pluck(".workflow", 1) %>%   # fitted workflow stored by last_fit()
  extract_fit_parsnip() %>%   # underlying engine fit
  vip(num_features = 20)      # top 20; permutation importance per the engine spec

7 Baseline model for Accuracy

# Class distribution in the full data set; the majority-class proportion is
# the accuracy baseline any model should beat.
d %>%
  count(verlauf) %>%
  mutate(prop = n / sum(n)) %>%
  gt() %>%
  fmt_number(columns = 3)  # format only the proportion column
verlauf n prop
bad 137 0.21
good 528 0.79